Earthquake Prediction Project¶
Importing the necessary libraries¶
In [30]:
import numpy as np
import pandas as pd
import requests
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import scatter_matrix
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import time
Linking the csv dataset file¶
In [31]:
df = pd.read_csv(r'C:\Users\arjun\OneDrive\Desktop\2023 Sept Fall 3rd sem\Predictive Analytics\Earthquake_dataset_Arjun\earthquake1.csv')
In [32]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 24007 entries, 0 to 24006 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 id 24007 non-null float64 1 date 24007 non-null object 2 time 24007 non-null object 3 lat 24007 non-null float64 4 long 24007 non-null float64 5 country 24007 non-null object 6 city 11754 non-null object 7 area 12977 non-null object 8 direction 10062 non-null object 9 dist 10062 non-null float64 10 depth 24007 non-null float64 11 xm 24007 non-null float64 12 md 24007 non-null float64 13 richter 24007 non-null float64 14 mw 5003 non-null float64 15 ms 24007 non-null float64 16 mb 24007 non-null float64 dtypes: float64(11), object(6) memory usage: 3.1+ MB
In [33]:
df.describe()
Out[33]:
| id | lat | long | dist | depth | xm | md | richter | mw | ms | mb | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.400700e+04 | 24007.000000 | 24007.000000 | 10062.000000 | 24007.000000 | 24007.000000 | 24007.000000 | 24007.000000 | 5003.000000 | 24007.000000 | 24007.000000 |
| mean | 1.991982e+13 | 37.929474 | 30.773229 | 3.175015 | 18.491773 | 4.056038 | 1.912346 | 2.196826 | 4.478973 | 0.677677 | 1.690561 |
| std | 2.060396e+11 | 2.205605 | 6.584596 | 4.715461 | 23.218553 | 0.574085 | 2.059780 | 2.081417 | 1.048085 | 1.675708 | 2.146108 |
| min | 1.910000e+13 | 29.740000 | 18.340000 | 0.100000 | 0.000000 | 3.500000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 |
| 25% | 1.980000e+13 | 36.190000 | 26.195000 | 1.400000 | 5.000000 | 3.600000 | 0.000000 | 0.000000 | 4.100000 | 0.000000 | 0.000000 |
| 50% | 2.000000e+13 | 38.200000 | 28.350000 | 2.300000 | 10.000000 | 3.900000 | 0.000000 | 3.500000 | 4.700000 | 0.000000 | 0.000000 |
| 75% | 2.010000e+13 | 39.360000 | 33.855000 | 3.600000 | 22.400000 | 4.400000 | 3.800000 | 4.000000 | 5.000000 | 0.000000 | 4.100000 |
| max | 2.020000e+13 | 46.350000 | 48.000000 | 95.400000 | 225.000000 | 7.900000 | 7.400000 | 7.200000 | 7.700000 | 7.900000 | 7.100000 |
In [34]:
df.shape
Out[34]:
(24007, 17)
In [35]:
df.head()
Out[35]:
| id | date | time | lat | long | country | city | area | direction | dist | depth | xm | md | richter | mw | ms | mb | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.000000e+13 | 2003.05.20 | 12:17:44 AM | 39.04 | 40.38 | turkey | bingol | baliklicay | west | 0.1 | 10.0 | 4.1 | 4.1 | 0.0 | NaN | 0.0 | 0.0 |
| 1 | 2.010000e+13 | 2007.08.01 | 12:03:08 AM | 40.79 | 30.09 | turkey | kocaeli | bayraktar_izmit | west | 0.1 | 5.2 | 4.0 | 3.8 | 4.0 | NaN | 0.0 | 0.0 |
| 2 | 1.980000e+13 | 1978.05.07 | 12:41:37 AM | 38.58 | 27.61 | turkey | manisa | hamzabeyli | south_west | 0.1 | 0.0 | 3.7 | 0.0 | 0.0 | NaN | 0.0 | 3.7 |
| 3 | 2.000000e+13 | 1997.03.22 | 12:31:45 AM | 39.47 | 36.44 | turkey | sivas | kahvepinar_sarkisla | south_west | 0.1 | 10.0 | 3.5 | 3.5 | 0.0 | NaN | 0.0 | 0.0 |
| 4 | 2.000000e+13 | 2000.04.02 | 12:57:38 AM | 40.80 | 30.24 | turkey | sakarya | meseli_serdivan | south_west | 0.1 | 7.0 | 4.3 | 4.3 | 0.0 | NaN | 0.0 | 0.0 |
In [36]:
df.columns
Out[36]:
Index(['id', 'date', 'time', 'lat', 'long', 'country', 'city', 'area',
'direction', 'dist', 'depth', 'xm', 'md', 'richter', 'mw', 'ms', 'mb'],
dtype='object')
Data Preprocessing
In [37]:
df = df.drop('id',axis=1)
In [38]:
import datetime
import time
import pandas as pd
timestamp = []
for d, t in zip(df['date'], df['time']):
try:
ts = datetime.datetime.strptime(d+' '+t, '%Y.%m.%d %I:%M:%S %p')
timestamp.append(time.mktime(ts.timetuple()))
except OverflowError:
timestamp.append(None) # or any other value that indicates an error
timeStamp = pd.Series(timestamp)
df['Timestamp'] = timeStamp.values
final_data = df.drop(['date', 'time'], axis=1)
df = final_data
df = df.dropna(subset=['Timestamp']) # Drop rows with invalid timestamps
df.head()
Out[38]:
| lat | long | country | city | area | direction | dist | depth | xm | md | richter | mw | ms | mb | Timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 39.04 | 40.38 | turkey | bingol | baliklicay | west | 0.1 | 10.0 | 4.1 | 4.1 | 0.0 | NaN | 0.0 | 0.0 | 1.053401e+09 |
| 1 | 40.79 | 30.09 | turkey | kocaeli | bayraktar_izmit | west | 0.1 | 5.2 | 4.0 | 3.8 | 4.0 | NaN | 0.0 | 0.0 | 1.185937e+09 |
| 2 | 38.58 | 27.61 | turkey | manisa | hamzabeyli | south_west | 0.1 | 0.0 | 3.7 | 0.0 | 0.0 | NaN | 0.0 | 3.7 | 2.633605e+08 |
| 3 | 39.47 | 36.44 | turkey | sivas | kahvepinar_sarkisla | south_west | 0.1 | 10.0 | 3.5 | 3.5 | 0.0 | NaN | 0.0 | 0.0 | 8.590015e+08 |
| 4 | 40.80 | 30.24 | turkey | sakarya | meseli_serdivan | south_west | 0.1 | 7.0 | 4.3 | 4.3 | 0.0 | NaN | 0.0 | 0.0 | 9.546479e+08 |
In [39]:
df.dtypes
Out[39]:
lat float64 long float64 country object city object area object direction object dist float64 depth float64 xm float64 md float64 richter float64 mw float64 ms float64 mb float64 Timestamp float64 dtype: object
In [12]:
# Data Encoding
label_encoder = preprocessing.LabelEncoder()
for col in df.columns:
if df[col].dtype == 'object':
label_encoder.fit(df[col])
df[col] = label_encoder.transform(df[col])
df.dtypes
C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\1335032553.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[col] = label_encoder.transform(df[col]) C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\1335032553.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[col] = label_encoder.transform(df[col]) C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\1335032553.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[col] = label_encoder.transform(df[col]) C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\1335032553.py:6: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[col] = label_encoder.transform(df[col])
Out[12]:
lat float64 long float64 country int32 city int32 area int32 direction int32 dist float64 depth float64 xm float64 md float64 richter float64 mw float64 ms float64 mb float64 Timestamp float64 dtype: object
In [13]:
df.isnull().sum()
Out[13]:
lat 0 long 0 country 0 city 0 area 0 direction 0 dist 12250 depth 0 xm 0 md 0 richter 0 mw 18681 ms 0 mb 0 Timestamp 0 dtype: int64
In [14]:
# Imputing Missing Values with Mean
si=SimpleImputer(missing_values = np.nan, strategy="mean")
si.fit(df[["dist","mw"]])
df[["dist","mw"]] = si.transform(df[["dist","mw"]])
df.isnull().sum()
C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\1255669323.py:4: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[["dist","mw"]] = si.transform(df[["dist","mw"]])
Out[14]:
lat 0 long 0 country 0 city 0 area 0 direction 0 dist 0 depth 0 xm 0 md 0 richter 0 mw 0 ms 0 mb 0 Timestamp 0 dtype: int64
Data Visualization
In [15]:
import plotly.express as px
px.scatter(df, x='richter',y='xm', color="direction")
In [16]:
plt.figure(figsize=(7,7))
sns.histplot(data=df, x='depth', hue='direction',palette = 'Accent')
plt.show()
In [17]:
plt.figure(figsize=(7,7))
df[['lat','long']].hist()
plt.show()
<Figure size 700x700 with 0 Axes>
In [19]:
plt.figure(figsize=(10,10))
sns.distplot(df.xm)
C:\Users\arjun\AppData\Local\Temp\ipykernel_18248\3879995472.py:2: UserWarning: `distplot` is a deprecated function and will be removed in seaborn v0.14.0. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). For a guide to updating your code to use the new functions, please see https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751
Out[19]:
<Axes: xlabel='xm', ylabel='Density'>
In [20]:
plt.figure(figsize=(15,10))
sns.barplot(x=df['xm'], y=df['ms'])
plt.xlabel('xm')
plt.ylabel('ms')
Out[20]:
Text(0, 0.5, 'ms')
In [21]:
plt.scatter(df.depth, df.xm)
plt.xlabel("Depth")
plt.ylabel("xm")
plt.show()
In [22]:
plt.scatter(df.depth, df.mb)
plt.xlabel("Depth")
plt.ylabel("Magnitude body")
plt.show()
In [23]:
plt.scatter(df.dist, df.depth)
plt.xlabel("Area affected")
plt.ylabel("Depth")
plt.show()
In [24]:
plt.scatter(df.dist, df.depth)
plt.xlabel("Area affected")
plt.ylabel("xm")
plt.show()
Correlation between Attributes
In [25]:
most_correlated = df.corr()['xm'].sort_values(ascending=False)
most_correlated
Out[25]:
xm 1.000000 mb 0.520944 ms 0.466182 richter 0.279865 mw 0.279404 depth 0.248506 area 0.111337 city 0.104095 long 0.091117 direction 0.085171 dist 0.001002 lat -0.043337 country -0.045640 md -0.101287 Timestamp -0.194235 Name: xm, dtype: float64
In [26]:
plt.figure(figsize=(20,20))
dataplot=sns.heatmap(df.corr(),annot=True)
plt.show()
Normalization of data
In [27]:
# Using MinMaxScaler
scaler = preprocessing.MinMaxScaler()
d = scaler.fit_transform(df)
df = pd.DataFrame(d, columns=df.columns)
df.head()
Out[27]:
| lat | long | country | city | area | direction | dist | depth | xm | md | richter | mw | ms | mb | Timestamp | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.559904 | 0.743088 | 0.76 | 0.173913 | 0.116178 | 0.875 | 0.0 | 0.048077 | 0.150 | 0.554054 | 0.000000 | 0.553294 | 0.0 | 0.000000 | 0.701575 |
| 1 | 0.665262 | 0.396156 | 0.76 | 0.608696 | 0.132893 | 0.875 | 0.0 | 0.025000 | 0.125 | 0.513514 | 0.571429 | 0.553294 | 0.0 | 0.000000 | 0.789847 |
| 2 | 0.532210 | 0.312542 | 0.76 | 0.673913 | 0.459761 | 0.750 | 0.0 | 0.000000 | 0.050 | 0.000000 | 0.000000 | 0.553294 | 0.0 | 0.544118 | 0.175393 |
| 3 | 0.585792 | 0.610249 | 0.76 | 0.869565 | 0.511762 | 0.750 | 0.0 | 0.048077 | 0.000 | 0.472973 | 0.000000 | 0.553294 | 0.0 | 0.000000 | 0.572101 |
| 4 | 0.665864 | 0.401214 | 0.76 | 0.804348 | 0.688609 | 0.750 | 0.0 | 0.033654 | 0.200 | 0.581081 | 0.000000 | 0.553294 | 0.0 | 0.000000 | 0.635804 |
Splitting the Dataset
In [28]:
y=np.array(df['xm'])
X=np.array(df.drop('xm',axis=1))
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=2)
Creating Models
- Linear Regression
In [425]:
from sklearn.linear_model import LinearRegression
start1 = time.time()
linear=LinearRegression()
linear.fit(X_train,y_train)
ans1 = linear.predict(X_test)
end1 = time.time()
t1 = end1-start1
In [426]:
accuracy1=linear.score(X_test,y_test)
print("Accuracy of Linear Regression model is:",accuracy1)
Accuracy of Linear Regression model is: 0.63134131503029
In [427]:
from sklearn import metrics
print("Linear Regression")
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, ans1))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, ans1))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, ans1)))
Linear Regression Mean Absolute Error: 0.05878246463205686 Mean Squared Error: 0.00625827169726636 Root Mean Squared Error: 0.07910923901331854
In [428]:
plt.plot(y_test, ans1, 'o')
m, b = np.polyfit(y_test,ans1, 1)
plt.plot(y_test, m*y_test + b)
plt.xlabel("Actual Magnitude")
plt.ylabel("Predicted Magnitude")
Out[428]:
Text(0, 0.5, 'Predicted Magnitude')
- Decision Tree
In [449]:
from sklearn.tree import DecisionTreeRegressor
start2 = time.time()
regressor = DecisionTreeRegressor(random_state = 40)
regressor.fit(X_train,y_train)
ans2 = regressor.predict(X_test)
end2 = time.time()
t2 = end2-start2
In [450]:
accuracy2=regressor.score(X_test,y_test)
print("Accuracy of Decision Tree model is:",accuracy2)
Accuracy of Decision Tree model is: 0.9932960893884235
In [451]:
print("Decision Tree")
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, ans2))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, ans2))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, ans2)))
Decision Tree Mean Absolute Error: 0.0006909999621372331 Mean Squared Error: 0.00011380416561969702 Root Mean Squared Error: 0.010667903525046383
- KNN Model
In [432]:
from sklearn.neighbors import KNeighborsRegressor
start3 = time.time()
knn = KNeighborsRegressor(n_neighbors=6)
knn.fit(X_train, y_train)
ans3 = knn.predict(X_test)
end3 = time.time()
t3 = end3-start3
In [433]:
accuracy3=knn.score(X_test,y_test)
print("Accuracy of KNN model is:",accuracy3)
Accuracy of KNN model is: 0.8457466919393031
In [434]:
print("KNN Model")
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, ans3))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, ans3))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, ans3)))
KNN Model Mean Absolute Error: 0.03305598677318794 Mean Squared Error: 0.002618571462992348 Root Mean Squared Error: 0.051171979275696854
In [435]:
import random
info = {}
for i in range(10):
k = random.randint(2,10)
startk = time.time()
knn = KNeighborsRegressor(n_neighbors=k)
knn.fit(X_train, y_train)
ans3 = knn.predict(X_test)
endk = time.time()
tk = endk-startk
acc3=knn.score(X_test,y_test)
info[k] = [acc3,tk]
for i in info:
print("for k =",i,": accuracy =",info[i][0])
for k = 4 : accuracy = 0.8559118607470738 for k = 9 : accuracy = 0.8334625255508568 for k = 8 : accuracy = 0.8384577534478264 for k = 6 : accuracy = 0.8457466919393031 for k = 5 : accuracy = 0.8519381145638621 for k = 10 : accuracy = 0.8296048410841246 for k = 7 : accuracy = 0.8425261199362686
In [436]:
x = list(info.keys())
yacc = []
for i in info:
yacc.append(info[i][0])
plt.plot(x, yacc, 'o', color='black');
plt.xlabel("k value")
plt.ylabel("accuracy");
plt.title("Accuracy for different values of k")
Out[436]:
Text(0.5, 1.0, 'Accuracy for different values of k')
In [437]:
yt = []
for i in info:
yt.append(info[i][1])
plt.plot(x, yt, 'o', color='black');
plt.xlabel("k value")
plt.ylabel("execution time");
plt.title("Execution time for different values of k")
Out[437]:
Text(0.5, 1.0, 'Execution time for different values of k')
Comparison Graphs
- Accuracy
In [454]:
models = ["linear regression","decision tree","knn"]
accuracies = [accuracy1,accuracy2,accuracy3]
In [455]:
plt.bar(models, accuracies, color ='maroon',
width = 0.25)
plt.xlabel("Models")
plt.ylabel("Accuracies")
plt.title("Accuracy Comparison Graph")
Out[455]:
Text(0.5, 1.0, 'Accuracy Comparison Graph')
- Execution Time
In [456]:
times = [t1,t2,t3]
plt.bar(models, times, color ='maroon',
width = 0.25)
plt.xlabel("Models")
plt.ylabel("Execution Time")
plt.title("Execution Time Comparison Graph")
Out[456]:
Text(0.5, 1.0, 'Execution Time Comparison Graph')
In [ ]: